Setup, load packages, import data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(RSQLite)
library(ggplot2)
library(stringr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(forcats)
conn <- dbConnect(RSQLite::SQLite(), "data/spacemissions.sqlite")
dbExecute(conn, "UPDATE spacemissions SET Organisation = REPLACE(Organisation, 'Arm??e de l''Air', 'Armée de l''Air') WHERE Organisation LIKE '%Arm??e de l''Air%'")
## [1] 0
dbGetQuery(conn,"PRAGMA table_info(spacemissions)")
##   cid           name type notnull dflt_value pk
## 1   0   Organisation TEXT       0         NA  0
## 2   1       Location TEXT       0         NA  0
## 3   2           Date TEXT       0         NA  0
## 4   3         Detail TEXT       0         NA  0
## 5   4  Rocket_Status TEXT       0         NA  0
## 6   5          Price TEXT       0         NA  0
## 7   6 Mission_Status TEXT       0         NA  0
df <- dbGetQuery(conn, "SELECT * FROM spacemissions")

Bar charts & histograms

Launches per organisation

dbExecute(conn, "DROP TABLE IF EXISTS organisation_counts")
## [1] 0
dbExecute(conn, "
  CREATE TABLE organisation_counts (
    organisation VARCHAR(255),
    organisation_count INT
  );
")
## [1] 0
dbExecute(conn, "
  INSERT INTO organisation_counts (organisation, organisation_count)
  SELECT Organisation, COUNT(*) AS organisation_count
  FROM spacemissions
  GROUP BY Organisation;
")
## [1] 56
organisation_counts <- dbGetQuery(conn, "SELECT * FROM organisation_counts ORDER BY organisation_count ASC")

organisation_counts <- organisation_counts %>%
  mutate(organisation = factor(organisation, levels = organisation))

ggplot(organisation_counts, aes(x=organisation, y=organisation_count)) +
  geom_segment( aes(x=organisation, xend=organisation, y=0, yend=organisation_count), color="skyblue") +
  geom_point( color="blue", size=4, alpha=0.6) +
  theme_light() +
  coord_flip() +
  labs(
    x = "Organisation",
    y = "Number of missions",
    title = "Missions per organisation"
  ) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
  )

Rocket statuses

dbExecute(conn, "DROP TABLE IF EXISTS status_counts")
## [1] 0
dbExecute(conn, "
  CREATE TABLE status_counts (
    status VARCHAR(255),
    count INT
  );
")
## [1] 0
dbExecute(conn, "
  INSERT INTO status_counts (status, count)
  SELECT Rocket_Status, COUNT(*) AS count
  FROM spacemissions
  GROUP BY Rocket_Status;
")
## [1] 2
status_counts <- dbGetQuery(conn, "SELECT * FROM status_counts ORDER BY count ASC")

status_counts <- status_counts %>%
  mutate(status = recode(status,
                                "StatusActive" = "Active",
                                "StatusRetired" = "Retired"))

ggplot(status_counts, aes(x=status, y=count)) + 
  geom_bar(stat="identity", fill=rgb(0.1,0.4,0.5)) +
  labs(x="Rocket status",
       y="Number of rockets",
       title="Rocket status")

Mission costs histogram

mission_costs <- dbGetQuery(conn, "SELECT Price FROM spacemissions")

tidy_costs <- function(costs_vector) {
  costs_vector$Price <- as.numeric(gsub(",", "", costs_vector$Price))
  costs_vector <- na.omit(costs_vector)
  return(costs_vector)
}

mission_costs <- tidy_costs(mission_costs)

ggplot(mission_costs, aes(x=Price)) +
  geom_histogram(binwidth=100) +
  labs(x="Cost (millions of USD)",
       y="Count")

Choropleth maps

Setup countries shapefile

library(countrycode)
library(stringr)

# download countries geojson
tmp_geojson <- tempfile(fileext = ".geojson")
download.file(
  "https://raw.githubusercontent.com/datasets/geo-countries/cd9e0635901eac20294a57ee3b3ce0684d5e3f1a/data/countries.geojson",
  tmp_geojson
)

# read contents
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
countries_sf <- read_sf(tmp_geojson)

Fetch ISO3 codes for each location in dataframe

locations <- df$Location
pattern <- ".*,\\s*(.*)"
locations <- str_match(locations, pattern)[, 2]

country_mappings <- c(
  "Russia" = "RUS",
  "New Mexico" = "USA",
  "Yellow Sea" = "CHN",
  "Shahrud Missile Test Site" = "IRN",
  "Pacific Missile Range Facility" = "USA",
  "Barents Sea" = "RUS",
  "Gran Canaria" = "USA"
)

get_iso3_code <- function(country) {
  if (country %in% names(country_mappings)) {
    return(country_mappings[[country]])
  } else if (grepl("Pacific Ocean", country, ignore.case = TRUE)) {
    return(NA)
  } else {
    iso3_code <- countrycode(country, "country.name", "iso3c")
    # dynamically update country_mappings to run code faster
    country_mappings <<- c(country_mappings, setNames(iso3_code, country))
    return(countrycode(country, "country.name", "iso3c"))
  }
}

iso3_codes <- lapply(locations, get_iso3_code)

df$ISO_A3 <- iso3_codes

Successes choropleth

success_total <- df %>%
  filter(Mission_Status == "Success") %>%
  group_by(ISO_A3) %>%
  summarise(total_successes = n())

success_shapefile <- merge(countries_sf, success_total, by = "ISO_A3", all.x = TRUE)
#plot choropleth
ggplot(success_shapefile) +
  geom_sf(aes(fill = total_successes), linewidth = 0.1, alpha = 0.9) +
  theme_classic() +
  scale_fill_viridis_c(
    trans = "log", breaks = c(1, 5, 10, 100, 500, 1000),
    name = "Number of successful mission launches",
    guide = guide_legend(
      keyheight = unit(3, units = "mm"),
      keywidth = unit(12, units = "mm"),
      label.position = "bottom",
      title.position = "top",
      nrow = 1
    )
  ) +
  theme(legend.position = "bottom") +
  labs(
    title = "Successful space missions by country"
  )

Failure choropleth

failure_total <- df %>%
  filter(str_detect(Mission_Status, "Failure")) %>%
  group_by(ISO_A3) %>%
  summarise(total_failures = n())

failure_shapefile <- merge(countries_sf, failure_total, by = "ISO_A3", all.x = TRUE)
ggplot(failure_shapefile) +
  geom_sf(aes(fill = total_failures), linewidth = 0.1, alpha = 0.9) +
  theme_classic() +
  scale_fill_viridis_c(
    trans = "log", breaks = c(1, 5, 10, 50, 100, 150),
    name = "Number of failed mission launches",
    guide = guide_legend(
      keyheight = unit(3, units = "mm"),
      keywidth = unit(12, units = "mm"),
      label.position = "bottom",
      title.position = "top",
      nrow = 1
    )
  ) +
  theme(legend.position = "bottom") +
  labs(
    title = "Failed space missions by country"
  )

Plotly sunburst chart

get_values <- function(parents, labels) {
  if (length(parents) == 0 || parents == "") {
    return(sum(df$ISO_A3 == labels))
  } else if (grepl("-", parents)) {
    return(sum(df$ISO_A3 == sub("-.*", "", parents) & df$Organisation == sub(".*-", "", parents) & df$Mission_Status == labels))
  } else {
    return(sum(df$Organisation == labels & df$ISO_A3 == parents))
  }
}

get_country_data <- function(country) {
    organisations <- unique(subset(df, ISO_A3 == country, select = Organisation))$Organisation
    organisations <- paste0(country, "-", organisations)
    
    status_codes <- c("success", "fail", "partialfail", "prelaunchfail")
    
    organisation_codes <- expand.grid(organisation = organisations, item = status_codes)
    
    organisation_codes <- paste0(organisation_codes$organisation, "-", organisation_codes$item)
    
  temp_df <- data.frame(
    ids=c(country, organisations, organisation_codes)
  )
  
  temp_df$labels <- sub(".*-", "", temp_df$ids)
  
  temp_df <- temp_df %>%
        mutate(
            labels = case_when(
                grepl("prelaunchfail", ids, ignore.case = TRUE) ~ "Prelaunch Failure",
                grepl("success", ids, ignore.case = TRUE) ~ "Success",
                grepl("partialfail", ids, ignore.case = TRUE) ~ "Partial Failure",
                grepl("fail", ids, ignore.case = TRUE) ~ "Failure",
                TRUE ~ labels
            )
        )
  
temp_df$parents <- ifelse(
    str_count(temp_df$ids, "-") == 0,
    "",
    ifelse(
        str_count(temp_df$ids, "-") == 1,
        sub("-.*", "", temp_df$ids),
        sub("-(?!.*-).*", "", temp_df$ids, perl=TRUE)
    )
)

temp_df$values <- mapply(get_values, temp_df$parents, temp_df$labels)


return(temp_df)
}


valid_countries <- success_total$ISO_A3[!is.na(success_total$ISO_A3)]

sunburst_df <- do.call(rbind, lapply(valid_countries, get_country_data))

sunburst_df$values <- as.numeric(sunburst_df$values)

sunburst_df <- sunburst_df[sunburst_df$values != 0, ]
fig <- plot_ly()

fig <- fig %>% add_trace(
  type='sunburst',
  ids=sunburst_df$ids,
  labels=sunburst_df$labels,
  parents=sunburst_df$parents,
  values=sunburst_df$values,
  branchvalues='total',
  insidetextorientation='radial'
)
fig

Analysing spends

df$Price <- as.numeric(gsub(",", "", df$Price))

total_price_by_organisation <- df %>%
  group_by(Organisation) %>%
  summarise(Total_Price = sum(Price, na.rm = TRUE)) %>%
  filter(Total_Price!=0) %>%
  arrange(Total_Price)

total_price_by_organisation <- total_price_by_organisation %>%
  mutate(Organisation = factor(Organisation, levels = Organisation))

ggplot(total_price_by_organisation, aes(x=Organisation, y=Total_Price)) +
  geom_segment( aes(x=Organisation, xend=Organisation, y=0, yend=Total_Price), color="skyblue") +
  geom_point( color="blue", size=4, alpha=0.6) +
  theme_light() +
  coord_flip() +
  labs(
    x = "Organisation",
    y = "Total spend (millions of USD)",
    title = "Total expenditure on missions per organisation"
  ) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
  )

mean_price_by_organisation <- df %>%
  group_by(Organisation) %>%
  summarise(Mean_Price = mean(Price, na.rm = TRUE)) %>%
  arrange(Mean_Price)

mean_price_by_organisation <- na.omit(mean_price_by_organisation)

mean_price_by_organisation %>% 
  mutate(Organisation = fct_reorder(Organisation, Mean_Price)) %>%
  ggplot(aes(x=Organisation, y=Mean_Price)) + 
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Mean cost of mission per organisation",
    x = "Organisation",
    y = "Mean cost per mission (millions of USD)"
  )